1   package org.apache.lucene.search.spell;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one or more
5    * contributor license agreements.  See the NOTICE file distributed with
6    * this work for additional information regarding copyright ownership.
7    * The ASF licenses this file to You under the Apache License, Version 2.0
8    * (the "License"); you may not use this file except in compliance with
9    * the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  
20  import java.util.ArrayList;
21  import java.util.List;
22  import java.util.regex.Pattern;
23  
24  import junit.framework.Assert;
25  
26  import org.apache.lucene.analysis.Analyzer;
27  import org.apache.lucene.analysis.MockAnalyzer;
28  import org.apache.lucene.analysis.MockTokenizer;
29  import org.apache.lucene.document.Document;
30  import org.apache.lucene.document.Field;
31  import org.apache.lucene.index.DirectoryReader;
32  import org.apache.lucene.index.IndexReader;
33  import org.apache.lucene.index.RandomIndexWriter;
34  import org.apache.lucene.index.Term;
35  import org.apache.lucene.search.spell.WordBreakSpellChecker.BreakSuggestionSortMethod;
36  import org.apache.lucene.store.Directory;
37  import org.apache.lucene.util.English;
38  import org.apache.lucene.util.IOUtils;
39  import org.apache.lucene.util.LuceneTestCase;
40  import org.apache.lucene.util.TestUtil;
41  
42  public class TestWordBreakSpellChecker extends LuceneTestCase {
43    private Directory dir;
44    private Analyzer analyzer;
45    
46    @Override
47    public void setUp() throws Exception {
48      super.setUp();
49      dir = newDirectory();
50      analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true);
51      RandomIndexWriter writer = new RandomIndexWriter(random(), dir, analyzer);
52  
53      for (int i = 900; i < 1112; i++) {
54        Document doc = new Document();
55        String num = English.intToEnglish(i).replaceAll("[-]", " ").replaceAll("[,]", "");
56        doc.add(newTextField("numbers", num, Field.Store.NO));
57        writer.addDocument(doc);
58      }
59      
60      {
61        Document doc = new Document();
62        doc.add(newTextField("numbers", "thou hast sand betwixt thy toes", Field.Store.NO));
63        writer.addDocument(doc);
64      }
65      {
66        Document doc = new Document();
67        doc.add(newTextField("numbers", "hundredeight eightyeight yeight", Field.Store.NO));
68        writer.addDocument(doc);
69      }
70      {
71        Document doc = new Document();
72        doc.add(newTextField("numbers", "tres y cinco", Field.Store.NO));
73        writer.addDocument(doc);
74      }
75      
76      writer.commit();
77      writer.close();
78    }
79    
80    @Override
81    public void tearDown() throws Exception {
82      IOUtils.close(dir, analyzer);
83      super.tearDown();
84    } 
85  
86    public void testCombiningWords() throws Exception {
87      IndexReader ir = DirectoryReader.open(dir);
88      WordBreakSpellChecker wbsp = new WordBreakSpellChecker();
89      
90      {        
91        Term[] terms = { 
92            new Term("numbers", "one"),
93            new Term("numbers", "hun"),
94            new Term("numbers", "dred"),
95            new Term("numbers", "eight"),
96            new Term("numbers", "y"),
97            new Term("numbers", "eight"),
98        };
99        wbsp.setMaxChanges(3);
100       wbsp.setMaxCombineWordLength(20);
101       wbsp.setMinSuggestionFrequency(1);
102       CombineSuggestion[] cs = wbsp.suggestWordCombinations(terms, 10, ir, SuggestMode.SUGGEST_ALWAYS);
103       Assert.assertTrue(cs.length==5);
104       
105       Assert.assertTrue(cs[0].originalTermIndexes.length==2);
106       Assert.assertTrue(cs[0].originalTermIndexes[0]==1);
107       Assert.assertTrue(cs[0].originalTermIndexes[1]==2);
108       Assert.assertTrue(cs[0].suggestion.string.equals("hundred"));
109       Assert.assertTrue(cs[0].suggestion.score==1);
110       
111       Assert.assertTrue(cs[1].originalTermIndexes.length==2);
112       Assert.assertTrue(cs[1].originalTermIndexes[0]==3);
113       Assert.assertTrue(cs[1].originalTermIndexes[1]==4);
114       Assert.assertTrue(cs[1].suggestion.string.equals("eighty"));
115       Assert.assertTrue(cs[1].suggestion.score==1);        
116       
117       Assert.assertTrue(cs[2].originalTermIndexes.length==2);
118       Assert.assertTrue(cs[2].originalTermIndexes[0]==4);
119       Assert.assertTrue(cs[2].originalTermIndexes[1]==5);
120       Assert.assertTrue(cs[2].suggestion.string.equals("yeight"));
121       Assert.assertTrue(cs[2].suggestion.score==1);
122       
123       for(int i=3 ; i<5 ; i++) {
124         Assert.assertTrue(cs[i].originalTermIndexes.length==3);
125         Assert.assertTrue(cs[i].suggestion.score==2);
126         Assert.assertTrue(
127             (cs[i].originalTermIndexes[0]==1 && 
128             cs[i].originalTermIndexes[1]==2 && 
129             cs[i].originalTermIndexes[2]==3 && 
130             cs[i].suggestion.string.equals("hundredeight")) ||
131             (cs[i].originalTermIndexes[0]==3 &&
132             cs[i].originalTermIndexes[1]==4 &&
133             cs[i].originalTermIndexes[2]==5 &&
134             cs[i].suggestion.string.equals("eightyeight"))
135             );
136       }     
137       
138       cs = wbsp.suggestWordCombinations(terms, 5, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
139       Assert.assertTrue(cs.length==2);
140       Assert.assertTrue(cs[0].originalTermIndexes.length==2);
141       Assert.assertTrue(cs[0].suggestion.score==1);
142       Assert.assertTrue(cs[0].originalTermIndexes[0]==1);
143       Assert.assertTrue(cs[0].originalTermIndexes[1]==2);
144       Assert.assertTrue(cs[0].suggestion.string.equals("hundred"));
145       Assert.assertTrue(cs[0].suggestion.score==1);
146       
147       Assert.assertTrue(cs[1].originalTermIndexes.length==3);
148       Assert.assertTrue(cs[1].suggestion.score==2);
149       Assert.assertTrue(cs[1].originalTermIndexes[0] == 1);
150       Assert.assertTrue(cs[1].originalTermIndexes[1] == 2);
151       Assert.assertTrue(cs[1].originalTermIndexes[2] == 3);
152       Assert.assertTrue(cs[1].suggestion.string.equals("hundredeight"));
153     }
154     ir.close();
155   }  
156  
157   public void testBreakingWords() throws Exception {
158     IndexReader ir = DirectoryReader.open(dir);
159     WordBreakSpellChecker wbsp = new WordBreakSpellChecker();
160     
161     {
162       Term term = new Term("numbers", "ninetynine");
163       wbsp.setMaxChanges(1);
164       wbsp.setMinBreakWordLength(1);
165       wbsp.setMinSuggestionFrequency(1);
166       SuggestWord[][] sw = wbsp.suggestWordBreaks(term, 5, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
167       Assert.assertTrue(sw.length==1);
168       Assert.assertTrue(sw[0].length==2);
169       Assert.assertTrue(sw[0][0].string.equals("ninety"));
170       Assert.assertTrue(sw[0][1].string.equals("nine"));
171       Assert.assertTrue(sw[0][0].score == 1);
172       Assert.assertTrue(sw[0][1].score == 1);
173     }
174     {
175       Term term = new Term("numbers", "onethousand");
176       wbsp.setMaxChanges(1);
177       wbsp.setMinBreakWordLength(1);
178       wbsp.setMinSuggestionFrequency(1);
179       SuggestWord[][] sw = wbsp.suggestWordBreaks(term, 2, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
180       Assert.assertTrue(sw.length==1);
181       Assert.assertTrue(sw[0].length==2);
182       Assert.assertTrue(sw[0][0].string.equals("one"));
183       Assert.assertTrue(sw[0][1].string.equals("thousand"));
184       Assert.assertTrue(sw[0][0].score == 1);
185       Assert.assertTrue(sw[0][1].score == 1);
186       
187       wbsp.setMaxChanges(2);
188       wbsp.setMinSuggestionFrequency(1);
189       sw = wbsp.suggestWordBreaks(term, 1, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
190       Assert.assertTrue(sw.length==1);
191       Assert.assertTrue(sw[0].length==2);
192       
193       wbsp.setMaxChanges(2);
194       wbsp.setMinSuggestionFrequency(2);
195       sw = wbsp.suggestWordBreaks(term, 2, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
196       Assert.assertTrue(sw.length==1);
197       Assert.assertTrue(sw[0].length==2);
198       
199       wbsp.setMaxChanges(2);
200       wbsp.setMinSuggestionFrequency(1);
201       sw = wbsp.suggestWordBreaks(term, 2, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
202       Assert.assertTrue(sw.length==2);
203       Assert.assertTrue(sw[0].length==2);
204       Assert.assertTrue(sw[0][0].string.equals("one"));
205       Assert.assertTrue(sw[0][1].string.equals("thousand"));
206       Assert.assertTrue(sw[0][0].score == 1);
207       Assert.assertTrue(sw[0][1].score == 1);
208       Assert.assertTrue(sw[0][1].freq>1);
209       Assert.assertTrue(sw[0][0].freq>sw[0][1].freq);
210       Assert.assertTrue(sw[1].length==3);
211       Assert.assertTrue(sw[1][0].string.equals("one"));
212       Assert.assertTrue(sw[1][1].string.equals("thou"));
213       Assert.assertTrue(sw[1][2].string.equals("sand"));
214       Assert.assertTrue(sw[1][0].score == 2);
215       Assert.assertTrue(sw[1][1].score == 2);
216       Assert.assertTrue(sw[1][2].score == 2);
217       Assert.assertTrue(sw[1][0].freq>1);
218       Assert.assertTrue(sw[1][1].freq==1);
219       Assert.assertTrue(sw[1][2].freq==1);
220     }
221     {
222       Term term = new Term("numbers", "onethousandonehundredeleven");
223       wbsp.setMaxChanges(3);
224       wbsp.setMinBreakWordLength(1);
225       wbsp.setMinSuggestionFrequency(1);
226       SuggestWord[][] sw = wbsp.suggestWordBreaks(term, 5, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
227       Assert.assertTrue(sw.length==0);
228       
229       wbsp.setMaxChanges(4);
230       sw = wbsp.suggestWordBreaks(term, 5, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
231       Assert.assertTrue(sw.length==1);
232       Assert.assertTrue(sw[0].length==5);
233       
234       wbsp.setMaxChanges(5);
235       sw = wbsp.suggestWordBreaks(term, 5, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
236       Assert.assertTrue(sw.length==2);
237       Assert.assertTrue(sw[0].length==5);
238       Assert.assertTrue(sw[0][1].string.equals("thousand"));
239       Assert.assertTrue(sw[1].length==6);
240       Assert.assertTrue(sw[1][1].string.equals("thou"));
241       Assert.assertTrue(sw[1][2].string.equals("sand"));
242     }
243     {
244       //make sure we can handle 2-char codepoints
245       Term term = new Term("numbers", "\uD864\uDC79");
246       wbsp.setMaxChanges(1);
247       wbsp.setMinBreakWordLength(1);
248       wbsp.setMinSuggestionFrequency(1);
249       SuggestWord[][] sw = wbsp.suggestWordBreaks(term, 5, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
250       Assert.assertTrue(sw.length==0);        
251     }
252     
253     ir.close();
254   }
255 
256   public void testRandom() throws Exception {
257     int numDocs = TestUtil.nextInt(random(), (10 * RANDOM_MULTIPLIER),
258         (100 * RANDOM_MULTIPLIER));
259     IndexReader ir = null;
260     
261     Directory dir = newDirectory();
262     Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
263     RandomIndexWriter writer = new RandomIndexWriter(random(), dir, analyzer);
264     int maxLength = TestUtil.nextInt(random(), 5, 50);
265     List<String> originals = new ArrayList<>(numDocs);
266     List<String[]> breaks = new ArrayList<>(numDocs);
267     for (int i = 0; i < numDocs; i++) {
268       String orig = "";
269       if (random().nextBoolean()) {
270         while (!goodTestString(orig)) {
271           orig = TestUtil.randomSimpleString(random(), maxLength);
272         }
273       } else {
274         while (!goodTestString(orig)) {
275           orig = TestUtil.randomUnicodeString(random(), maxLength);
276         }
277       }
278       originals.add(orig);
279       int totalLength = orig.codePointCount(0, orig.length());
280       int breakAt = orig.offsetByCodePoints(0,
281           TestUtil.nextInt(random(), 1, totalLength - 1));
282       String[] broken = new String[2];
283       broken[0] = orig.substring(0, breakAt);
284       broken[1] = orig.substring(breakAt);
285       breaks.add(broken);
286       Document doc = new Document();
287       doc.add(newTextField("random_break", broken[0] + " " + broken[1],
288           Field.Store.NO));
289       doc.add(newTextField("random_combine", orig, Field.Store.NO));
290       writer.addDocument(doc);
291     }
292     writer.commit();
293     writer.close();
294     
295     ir = DirectoryReader.open(dir);
296     WordBreakSpellChecker wbsp = new WordBreakSpellChecker();
297     wbsp.setMaxChanges(1);
298     wbsp.setMinBreakWordLength(1);
299     wbsp.setMinSuggestionFrequency(1);
300     wbsp.setMaxCombineWordLength(maxLength);
301     for (int i = 0; i < originals.size(); i++) {
302       String orig = originals.get(i);
303       String left = breaks.get(i)[0];
304       String right = breaks.get(i)[1];
305       {
306         Term term = new Term("random_break", orig);
307         
308         SuggestWord[][] sw = wbsp.suggestWordBreaks(term, originals.size(),
309             ir, SuggestMode.SUGGEST_ALWAYS,
310             BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
311         boolean failed = true;
312         for (SuggestWord[] sw1 : sw) {
313           Assert.assertTrue(sw1.length == 2);
314           if (sw1[0].string.equals(left) && sw1[1].string.equals(right)) {
315             failed = false;
316           }
317         }
318         Assert.assertFalse("Failed getting break suggestions\n >Original: "
319             + orig + "\n >Left: " + left + "\n >Right: " + right, failed);
320       }
321       {
322         Term[] terms = {new Term("random_combine", left),
323             new Term("random_combine", right)};
324         CombineSuggestion[] cs = wbsp.suggestWordCombinations(terms,
325             originals.size(), ir, SuggestMode.SUGGEST_ALWAYS);
326         boolean failed = true;
327         for (CombineSuggestion cs1 : cs) {
328           Assert.assertTrue(cs1.originalTermIndexes.length == 2);
329           if (cs1.suggestion.string.equals(left + right)) {
330             failed = false;
331           }
332         }
333         Assert.assertFalse("Failed getting combine suggestions\n >Original: "
334             + orig + "\n >Left: " + left + "\n >Right: " + right, failed);
335       }
336     }
337     IOUtils.close(ir, dir, analyzer);
338   }
339   
340   private static final Pattern mockTokenizerWhitespacePattern = Pattern
341       .compile("[ \\t\\r\\n]");
342   
343   private boolean goodTestString(String s) {
344     if (s.codePointCount(0, s.length()) < 2
345         || mockTokenizerWhitespacePattern.matcher(s).find()) {
346       return false;
347     }
348     return true;
349   }
350  }